Prerequisite libraries:
#install.packages('R.utils', repos = "http://cran.us.r-project.org")
#install.packages("treemapify", repos = "http://cran.us.r-project.org")
library(data.table)
library(tidyr)
library(ggplot2)
library(dplyr)
library(treemapify)
library(ggridges)
library(DT)
library(plotly)
# Read the gzip-compressed Rebrickable CSV exports into data.tables.

# Catalogue tables: colours, parts and part categories.
colors <- fread(input = "rebrickable/colors.csv.gz")
parts <- fread(input = "rebrickable/parts.csv.gz")
part_categories <- fread(input = "rebrickable/part_categories.csv.gz")

# Sets and the themes they are assigned to.
sets <- fread(input = "rebrickable/sets.csv.gz")
themes <- fread(input = "rebrickable/themes.csv.gz")

# Inventories and the parts listed in each inventory.
inventories <- fread(input = "rebrickable/inventories.csv.gz")
inventory_parts <- fread(input = "rebrickable/inventory_parts.csv.gz")
# ---- Shared plot styling ----

# Colours of the dark plot theme.
background_color <- "#2d2d2d"
font_color <- "white"
tick_color <- "#DDDDDD"

# Text sizes for titles, axis labels and tick labels.
title_size <- 20
label_size <- 16
tick_size <- 12

# Grid line size and code chunk length; neither is referenced by the plots
# in the visible part of this file.
grid_size <- 0.5
code_chunk_length <- 9.67

# Accent colours used for bars and lines.
# NOTE(review): "#FFFFC0" is a pale yellow rather than a coral tone - confirm
# that the name/value pairing is intended.
teal <- "#40C0C0"
coral <- "#FFFFC0"
orange <- "#FF7E67"
In this section we will explore data regarding themes.
# Root themes: rows of `themes` that have no parent, grouped by id and name.
parents <- group_by(filter(themes, is.na(parent_id)), id, name)
# Resolve the name of the root (top-level) theme for a given parent id.
#
# Reads the global tables `themes` (columns `id`, `parent_id`) and `parents`
# (columns `id`, `name`).
#
# parent_id: the `parent_id` value of a theme; NA for a theme without parent.
#
# Returns the sentinel string "parent" when `parent_id` is NA (the theme is
# itself a root), the root theme's name otherwise, or NA_character_ when the
# chain cannot be resolved (an id missing from `themes`, a cyclic hierarchy,
# or a root that is absent from `parents`).
find_parent <- function(parent_id) {
  if (is.na(parent_id)) {
    return("parent")
  }

  current_id <- parent_id
  visited <- c()
  repeat {
    row_idx <- match(current_id, themes$id)
    # Unknown id or an id we have already walked through: give up instead of
    # failing on a zero-length condition or looping forever.
    if (is.na(row_idx) || current_id %in% visited) {
      return(NA_character_)
    }
    next_id <- themes$parent_id[row_idx]
    if (is.na(next_id)) {
      # `current_id` has no parent, so it is the root of the chain.
      break
    }
    visited <- c(visited, current_id)
    current_id <- next_id
  }

  root_name <- parents$name[parents$id == current_id]
  if (length(root_name) == 0L) {
    NA_character_
  } else {
    as.character(root_name[[1L]])
  }
}
# Sets joined with their theme; both tables have a `name` column, so each is
# renamed before the merge to keep them apart.
set_themes_children <- merge(
  rename(sets, set_name = name),
  rename(themes, theme_name = name),
  by.x = "theme_id", by.y = "id"
)

# Resolve the root theme once per distinct parent id rather than once per set,
# and use vapply() so `parent_name` is a plain character vector instead of a
# list column.
unique_parent_ids <- unique(set_themes_children$parent_id)
root_names <- vapply(unique_parent_ids, find_parent, character(1))
set_themes_children$parent_name <-
  root_names[match(set_themes_children$parent_id, unique_parent_ids)]

# find_parent() returns the sentinel "parent" for themes that are themselves
# roots; for those the theme's own name is the parent name.
set_themes_children <- set_themes_children %>%
  mutate(parent_name = ifelse(parent_name == "parent", theme_name, parent_name))
Top 5 cumsum
# Attach theme columns to every set; all.x = TRUE keeps sets whose theme_id
# has no match in `themes`. Both tables have a `name` column, so after the
# merge the set name is `name.x` and the theme name is `name.y`.
merged_df <- merge(sets, themes, by.x = "theme_id", by.y = "id", all.x = TRUE)

# Axis breaks shared by the year-based plots.
selected_years <- c(1950, 1960, 1970, 1980, 1990, 2000, 2010, 2020)

# Names of the five themes with the most sets. The count is already cut to
# five rows, so no further indexing is needed (indexing with 1:5 would pad the
# vector with NA when fewer than five themes exist).
popular_themes <- merged_df %>%
  count(name.y, sort = TRUE) %>%
  head(5)
popular_themes <- popular_themes$name.y

merged_df_filtered <- merged_df[merged_df$name.y %in% popular_themes, ]

# Number of sets per theme and year; `.groups = "drop"` returns an ungrouped
# result without the regrouping message.
theme_year_counts <- merged_df_filtered %>%
  group_by(name.y, year) %>%
  summarise(num_sets = n(), .groups = "drop")
## `summarise()` has grouped output by 'name.y'. You can override using the
## `.groups` argument.
# Running total of sets within each theme, in the row order of
# `theme_year_counts`.
theme_year_counts_cumsum <- mutate(
  group_by(theme_year_counts, name.y),
  cum_sum = cumsum(num_sets)
)

# One line per theme showing the cumulative number of sets over the years.
ggplot(theme_year_counts_cumsum, aes(x = year, y = cum_sum, color = name.y)) +
  geom_line(size = 1) +
  ggtitle("Cumulative Sum of Sets for Chosen Themes") +
  labs(x = "Year", y = "Number of Sets", color = "Theme") +
  scale_x_continuous(breaks = selected_years)
Star Wars
# Yearly set counts restricted to the "Star Wars" theme.
star_wars_counts <- filter(theme_year_counts, name.y == "Star Wars")

# Year, line colour and title for each film marked on the plot.
film_releases <- c(1999, 2002, 2005, 2015, 2017, 2019)
film_colors <- c("red", "orange", "green", "blue", "purple", "brown")
film_labels <- c(
  "The Phantom Menace", "Attack of the Clones", "Revenge of the Sith",
  "The Force Awakens", "The Last Jedi", "The Rise of Skywalker"
)

# Sets per year with a dotted vertical line at each film year. Labels for the
# first three films are placed at 40% of the peak, the last three at zero.
ggplot(star_wars_counts, aes(x = year, y = num_sets)) +
  geom_line(size = 1) +
  geom_vline(xintercept = film_releases, color = film_colors, linetype = "dotted") +
  annotate(
    "text",
    x = head(film_releases, 3) + 0.2,
    y = 0.4 * max(star_wars_counts$num_sets),
    label = head(film_labels, 3),
    color = head(film_colors, 3),
    angle = 90, hjust = -0.2, size = 3
  ) +
  annotate(
    "text",
    x = tail(film_releases, 3) + 0.2,
    y = 0,
    label = tail(film_labels, 3),
    color = tail(film_colors, 3),
    angle = 90, hjust = -0.2, size = 3
  ) +
  scale_x_continuous(breaks = selected_years) +
  ggtitle("Number of Produced Star Wars Sets") +
  labs(x = "Year", y = "Number of Sets") +
  theme(legend.position = "none")
Table of parent themes
# Sets joined with their theme (name columns renamed to avoid a clash).
set_themes <- merge(
  rename(sets, set_name = name),
  rename(themes, theme_name = name),
  by.x = "theme_id", by.y = "id"
)

# Add the inventories of each set, then the parts listed in each inventory.
set_theme_inventories <- merge(inventories, set_themes, by = "set_num")
set_theme_inventories_parts <- merge(
  inventory_parts, set_theme_inventories,
  by.x = "inventory_id", by.y = "id"
)

# Attach colour information; the merge key comes out as `id`, so it is renamed
# back to `color_id` afterwards.
set_theme_inventories_parts_colors <- rename(
  merge(
    rename(colors, color_name = name),
    set_theme_inventories_parts,
    by.x = "id", by.y = "color_id"
  ),
  color_id = id
)

# Keep only rows whose theme has no parent, with the colour/theme columns.
theme_colors <- select(
  filter(set_theme_inventories_parts_colors, is.na(parent_id)),
  one_of(c("color_name", "rgb", "theme_name", "theme_id", "color_id"))
)

# Number of rows per theme and colour, ordered by theme and descending count.
grouped_theme_colors <- arrange(
  summarise(
    group_by(theme_colors, theme_name, color_name, color_id, rgb),
    count = n(), .groups = "keep"
  ),
  theme_name, desc(count)
)
# Render a data frame as an interactive DT table with a column filter row and
# copy/csv/excel/pdf/print export buttons.
#
# table_df:           data frame to display.
# round_columns_func: predicate applied to every column; its results select
#                     the columns passed to formatRound().
# round_digits:       number of digits passed to formatRound().
#
# Returns the datatable widget after formatRound() has been applied.
prettyTable <- function(table_df, round_columns_func=is.numeric, round_digits=0) {
  table_options <- list(
    dom = 'Bfrtip',
    buttons = c('copy', 'csv', 'excel', 'pdf', 'print')
  )
  widget <- DT::datatable(
    table_df,
    style = "bootstrap",
    filter = "top",
    rownames = FALSE,
    extensions = "Buttons",
    options = table_options
  )
  columns_to_round <- unlist(lapply(table_df, round_columns_func))
  formatRound(widget, columns_to_round, round_digits)
}
prettyTable(grouped_theme_colors)

# Sets with theme columns attached (left join: sets without a matching theme
# are kept), then joined onto the inventories by set number.
merged_df <- merge(sets, themes, by.x = "theme_id", by.y = "id", all.x = TRUE)
merged_df_inventories <- merge(inventories, merged_df, by = "set_num", all.x = TRUE)

# Rename the join key so inventory parts can be merged on "id" below.
inventory_parts <- inventory_parts %>% rename(id = inventory_id)

# Rename the category key on the part_categories table itself. The previous
# code piped `inventory_parts` here, which replaced the loaded category table
# with a copy of the inventory parts.
part_categories <- part_categories %>% rename(part_cat_id = id)

# One row per inventory part, enriched with set, theme and part details.
merged_df_inventory_parts <- merge(merged_df_inventories, inventory_parts, by = "id")
merged_df_inventory_parts_final <- merge(merged_df_inventory_parts, parts, by = "part_num")

# `count` = number of rows in which each part number occurs; it is attached
# back onto every row.
parts_count <- merged_df_inventory_parts_final %>% count(part_num, name = "count")
merged_counts <- merge(merged_df_inventory_parts_final, parts_count, by = "part_num", all.x = TRUE)
Unique parts in years
# Parts that occur exactly once overall, counted per year; rows containing NA
# are dropped.
result <- merged_counts %>%
  filter(count == 1) %>%
  count(year) %>%
  na.omit()

# Line chart of the yearly number of such parts.
ggplot(result, aes(x = year, y = n)) +
  geom_line() +
  ggtitle("Number of Unique Lego Blocks per Year") +
  labs(x = "Year", y = "Number of Unique Lego Parts")
Unique cumsum
# Running total of the yearly counts, accumulated in chronological order.
result_cumsum <- mutate(arrange(result, year), n_cumsum = cumsum(n))

# Line chart of the cumulative count over the years.
ggplot(result_cumsum, aes(x = year, y = n_cumsum)) +
  geom_line() +
  ggtitle("Cumulative Sum of Rare Lego Parts through Years") +
  labs(x = "Year", y = "Number of Rare Lego Parts")
Unique vs average in a theme
result <- merged_counts

# Per theme: mean of `num_parts` and sum of the per-part `count` column over
# all rows of the theme; the 100 themes with the largest sum are kept and the
# columns get display names.
# NOTE(review): no `count == 1` filter is applied here, so the column labelled
# "Number of Rare Lego Parts" sums the counts of all parts - confirm this is
# the intended measure.
theme_counts <- result %>%
  group_by(name.y) %>%
  summarize(num_parts = mean(num_parts), count = sum(count)) %>%
  arrange(desc(count)) %>%
  head(100) %>%
  rename(
    `Average Number of Parts` = num_parts,
    `Number of Rare Lego Parts` = count
  )

# Interactive scatter plot with the theme name as tooltip text. Columns are
# referenced by (backtick-quoted) name inside aes() rather than through
# `theme_counts$...`, which ggplot2 discourages and which leaks the `$`
# expression into the plotly tooltips.
ggplotly(
  ggplot(
    theme_counts,
    aes(
      x = `Average Number of Parts`,
      y = `Number of Rare Lego Parts`,
      text = name.y
    )
  ) +
    geom_point() +
    labs(x = "Average Number of Parts", y = "Number of Rare Lego Parts") +
    ggtitle("Number of Rare Lego Parts vs Average Number of Parts for a Theme")
)
Average parts per set
# Sets with theme columns attached (left join on the theme id).
merged_df <- merge(x = sets, y = themes, by.x = "theme_id", by.y = "id", all.x = TRUE)

# Mean `num_parts` of the sets released in each year.
avg_parts_per_year <- summarize(
  group_by(merged_df, year),
  avg_parts = mean(num_parts)
)

# Line chart of the yearly mean.
ggplot(avg_parts_per_year, aes(x = year, y = avg_parts)) +
  geom_line(size = 1) +
  ggtitle("Average Number of Parts per Set per Year") +
  labs(x = "Year", y = "Average Number of Parts per Set")
20 themes with the highest number of unique LEGO parts
# Parts that occur exactly once, counted per theme. Rows with NA are dropped,
# the "Database Sets" theme is excluded, and the 20 largest counts are kept.
result_2 <- merged_counts %>%
  filter(count == 1) %>%
  count(name.y) %>%
  na.omit() %>%
  arrange(desc(n)) %>%
  filter(name.y != "Database Sets") %>%
  head(20)

# Horizontal bar chart, themes ordered by count, drawn in the dark theme.
ggplot(result_2, aes(x = n, y = reorder(name.y, n))) +
  geom_col(fill = coral) +
  labs(
    title = "20 Themes with the Highest Amount of Unique LEGO Parts",
    x = "Number of Unique Parts",
    y = "Theme"
  ) +
  theme(
    panel.background = element_rect(fill = background_color),
    plot.background = element_rect(fill = background_color),
    panel.grid = element_line(color = "#DDDDDD"),
    plot.title = element_text(size = title_size, colour = font_color, hjust = 4),
    axis.title.x = element_text(size = label_size, colour = font_color, hjust = 0.2),
    axis.title.y = element_text(size = label_size, colour = font_color),
    axis.text = element_text(size = tick_size, color = tick_color)
  )
Average and maximum parts per theme
# Per theme: the mean and the maximum of `num_parts` over its sets.
theme_part_counts <- merged_df %>%
  group_by(name.y) %>%
  summarize(avg_parts = mean(num_parts), max_parts = max(num_parts))

# The 20 themes with the highest average and with the highest maximum.
top_20_avg_parts <- theme_part_counts %>% arrange(desc(avg_parts)) %>% head(20)
top_20_max_parts <- theme_part_counts %>% arrange(desc(max_parts)) %>% head(20)

# Horizontal bars show the average per theme; the dot on each row marks the
# maximum for the same theme.
ggplot(top_20_avg_parts, aes(x = reorder(name.y, avg_parts), y = avg_parts)) +
  geom_col(fill = coral) +
  coord_flip() +
  geom_point(
    aes(x = reorder(name.y, avg_parts), y = max_parts),
    shape = 21, size = 3, stroke = 1.5,
    color = background_color, fill = coral
  ) +
  ggtitle("Average and Maximum Number of Parts in Selected Themes") +
  labs(x = "Theme", y = "Average Number of Parts") +
  theme(
    panel.background = element_rect(fill = background_color),
    plot.background = element_rect(fill = background_color),
    panel.grid = element_line(color = "#DDDDDD"),
    plot.title = element_text(size = title_size, colour = font_color, hjust = 4),
    axis.title.x = element_text(size = label_size, colour = font_color, hjust = 0.2),
    axis.title.y = element_text(size = label_size, colour = font_color),
    axis.text = element_text(size = tick_size, color = tick_color)
  )
# Number of sets per year with a fitted exponential trend.
# Tabulate sets by year and turn the table into a two-column data frame.
num_sets_per_year <- table(sets$year)
num_sets_per_year_df <- as.data.frame(num_sets_per_year)
colnames(num_sets_per_year_df) <- c("year", "num_sets")
# Axis breaks for the year axis.
selected_years <- c(1950, 1960, 1970, 1980, 1990, 2000, 2010, 2020)
# Drop the last row of the table.
# NOTE(review): presumably the most recent year is excluded because its data
# is incomplete - confirm.
num_sets_per_year_df_new <- head(num_sets_per_year_df,-1)
# Log-linear fit: log(num_sets) regressed on as.integer(year).
# NOTE(review): as.data.frame() on a table normally yields a factor column, in
# which case as.integer() gives the level position (1, 2, ...) rather than the
# calendar year - confirm this is intended.
exp_model <- lm(log(num_sets_per_year_df_new$num_sets)~ as.integer(num_sets_per_year_df_new$year))
# Line of the yearly counts plus a smooth layer that regresses y on
# exp(intercept + slope * x), using the coefficients of `exp_model` above.
ggplot(num_sets_per_year_df_new, aes(x = year, y = num_sets, group=1)) +
geom_line(color = teal, size=1) +
geom_smooth(method = "lm", formula = y ~ exp(coef(exp_model)[1] + coef(exp_model)[2] * x),
se = FALSE, color=orange) +
scale_x_discrete(breaks = selected_years) +
xlab("Year") +
ylab("Number of Sets") +
ggtitle("Number of Sets per Year") +
theme(plot.background = element_rect(fill=background_color),
plot.title=element_text(size=title_size, colour = font_color),
axis.title.x = element_text(size=label_size, colour = font_color),
axis.title.y = element_text(size=label_size, color = font_color),
axis.text.x = element_text(size=tick_size, color = tick_color),
axis.text.y = element_text(size=tick_size, color = tick_color),
panel.background = element_rect(fill=background_color))
In the next section we will explore the colors data.
Here are the top 100 most frequent colors based on the inventory_parts table quantities:
# Number of colours shown in the treemap; the accompanying text refers to the
# top 100. The previous code used `top_n`, which is not defined in this file
# and therefore resolved to dplyr's top_n() function.
n_top_colors <- 100L

# Total quantity per colour id over all inventory parts, joined with the
# colour table.
color_id_sums <- inventory_parts %>%
  group_by(color_id) %>%
  summarise(sum = sum(quantity))
color_sums <- inner_join(x = colors, y = color_id_sums, by = join_by(id == color_id))

# Split the six-character `rgb` string into "0x"-prefixed channel strings.
color_sums <- color_sums %>% mutate(
  r = paste0("0x", substr(rgb, start = 1, stop = 2)),
  g = paste0("0x", substr(rgb, start = 3, stop = 4)),
  b = paste0("0x", substr(rgb, start = 5, stop = 6))
)

# Keep the most frequent colours (ascending by total quantity) and number the
# rows; seq_len(n()) also works when fewer than `n_top_colors` rows exist.
top_colors <- color_sums %>%
  arrange(sum) %>%
  tail(n_top_colors) %>%
  mutate(order = seq_len(n()))

# Treemap with one tile per colour, filled with the colour itself and labelled
# with its rgb code. All labels are white except the third row from the end,
# which uses colour 1.
# NOTE(review): this label colour vector assumes at least three rows.
top_colors %>% ggplot(aes(area = sum, fill = factor(order), label = rgb)) +
  geom_treemap() +
  geom_treemap_text(
    min.size = 8, size = 20,
    color = c(rep("white", nrow(top_colors) - 3), 1, rep("white", 2))
  ) +
  scale_fill_manual(
    values = rgb(top_colors$r, top_colors$g, top_colors$b, maxColorValue = 255)
  ) +
  ggtitle("Most frequent colors") +
  theme(
    legend.position = "none",
    plot.background = element_rect(fill = background_color),
    plot.title = element_text(hjust = .5, size = title_size, colour = font_color)
  )
In the figure we can observe that the most common colors are different shades of grey. Characteristic tones of yellow, blue and red are also present in the treemap.
# Number of colours per `is_trans` value, with a percentage label and a
# Yes/No flag ('f' maps to "No", anything else to "Yes").
trans_count <- mutate(
  summarise(group_by(colors, is_trans), count = n()),
  proportions = paste0(round(count / sum(count), 2) * 100, "%"),
  is_transparent = ifelse(is_trans == "f", "No", "Yes")
)

# Pie chart (a stacked bar in polar coordinates) without a legend; the
# percentage labels and the two captions are placed with fixed offsets.
ggplot(trans_count, aes(x = "", y = count, fill = is_transparent, label = proportions)) +
  geom_bar(stat = "identity", width = 1) +
  coord_polar("y", start = 0) +
  ggtitle("Proportion of transparent colors") +
  theme_void() +
  geom_text(
    nudge_x = c(0.0025, 0.2), nudge_y = c(-45, -19),
    size = 9, color = font_color, fontface = "bold"
  ) +
  theme(
    legend.position = "none",
    axis.ticks.length = unit(0, "pt"),
    plot.background = element_rect(fill = background_color),
    plot.title = element_text(hjust = 0.3, vjust = -1.5, color = font_color, size = title_size)
  ) +
  annotate(geom = "text", x = 1.9, y = 20, label = "Transparent", size = 7, color = font_color) +
  annotate(geom = "text", x = 1.9, y = 125, label = "Non-transparent", size = 7, color = font_color)